Example: Monthly electricity sales for Virginia
Briefly characterize the dataset
library(arrow)
esales <- read_feather("esales.feather")
print(esales) # print the data as a table
summary(esales) # compute basic summary statistics about the data
value date year month
Min. : 7153 Min. :2001-01-01 Min. :2001 Min. : 1.000
1st Qu.: 8200 1st Qu.:2005-11-01 1st Qu.:2005 1st Qu.: 3.000
Median : 9019 Median :2010-09-01 Median :2010 Median : 6.000
Mean : 9093 Mean :2010-08-31 Mean :2010 Mean : 6.425
3rd Qu.: 9885 3rd Qu.:2015-07-01 3rd Qu.:2015 3rd Qu.: 9.000
Max. :11724 Max. :2020-05-01 Max. :2020 Max. :12.000
# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/
esales %>%
filter(year == 2019) %>%
filter(value > 9000) %>%
print()
esales %>%
group_by(month) %>%
summarise(mean = mean(value)) -> mean_esales_by_month
esales %>%
mutate(sales_TWh = value/1000) %>%
select(-value)
# filter(data object, condition) : syntax for filter() command
Plot the time series
#Reference: https://ggplot2.tidyverse.org/
ggplot(data=esales, aes(x=date,y=value)) +
geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")

Convert the data frame into a time series tsibble object
library(lubridate) # Make it easy to deal with dates
esales %>% filter(month==3) # These three lines of code
esales %>% filter(month(date)==3) # all do
esales %>% filter(lubridate::month(date)==3) # the same thing.
# We don't have to keep the 'year' and 'month' column: can recover them if needed
esales %>%
select(date, sales_GWh = value) -> esales_tbl
print(elsales_tbl)
# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html
esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts
print(elsales_tbl_ts)
Make some plots
Make a histogram of monthly sales
hist(elsales_tbl_ts$sales_GWh, breaks=40)

Make a seasonal plot
elsales_tbl_ts %>%
feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)
elsales_tbl_ts %>%
mutate(Month = tsibble::yearmonth(date)) %>%
as_tsibble(index = Month) %>%
select(Month,sales_GWh) -> vaelsales_tbl_ts
print(vaelsales_tbl_ts)
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")

vaelsales_tbl_ts %>%
gg_subseries(sales_GWh)

vaelsales_tbl_ts %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()

# if(!('fpp3' %in% installed.packages())) install.packages('fpp3')
library(fpp3)
# decompose(vaelsales_tbl_ts)
vaelsales_tbl_ts %>%
model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
components() %>%
autoplot()

vaelsales_tbl_ts %>%
mutate(ln_sales_GWh = log(sales_GWh)) %>%
model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
robust = TRUE)) %>%
components() %>%
autoplot()

vaelsales_tbl_ts %>%
features(sales_GWh, feat_stl)
vaelsales_tbl_ts %>%
features(sales_GWh, feature_set(pkgs=\feasts\))
Example: Australian production
# install.packages('tsibbledata')
library(tsibbledata)
aus_production
aus_production %>% gg_season(Electricity)
aus_production %>% gg_season(Beer)
Example: Gross Domestic Product data
Exploratory data analysis
library(tsibbledata) # Data sets package
print(global_economy)
global_economy %>% filter(Country==\Sweden\) %>% print()
# A tsibble: 58 x 9 [1Y]
# Key: Country [1]
Country Code Year GDP Growth CPI Imports Exports Population
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Sweden SWE 1960 14842870293. NA 9.21 23.4 23.0 7484656
2 Sweden SWE 1961 16147160123. 5.68 9.41 21.7 22.3 7519998
3 Sweden SWE 1962 17511477311. 4.26 9.86 21.4 21.9 7561588
4 Sweden SWE 1963 18954132366. 5.33 10.1 21.5 21.9 7604328
5 Sweden SWE 1964 21137242561. 6.82 10.5 21.9 22.3 7661354
6 Sweden SWE 1965 23260320646. 3.82 11.0 22.5 21.9 7733853
7 Sweden SWE 1966 25302033132. 2.09 11.7 21.9 21.4 7807797
8 Sweden SWE 1967 27463409202. 3.37 12.2 21.0 21.1 7867931
9 Sweden SWE 1968 29143383491. 3.64 12.5 21.6 21.6 7912273
10 Sweden SWE 1969 31649203886. 5.01 12.8 23.0 22.8 7968072
# … with 48 more rows
global_economy %>%
filter(Country==\Sweden\) %>%
autoplot(GDP) +
ggtitle(\GDP for Sweden\) + ylab(\$US billions\)

Fitting data to simple models
global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit
fit
fit %>% filter(Country == \Sweden\) %>% residuals()
fit %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot(.resid)

Work with ln(GDP)
global_economy %>%
filter(Country==\Sweden\) %>%
autoplot(log(GDP)) +
ggtitle(\ln(GDP) for Sweden\) + ylab(\$US billions\)

global_economy %>%
model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
logfit %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot()

global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3
fit3 %>% filter(Country == \Sweden\) %>% residuals() %>% autoplot()

Producing forecasts
fit %>% forecast(h = \3 years\) -> fcast3yrs
fcast3yrs
fcast3yrs %>% filter(Country == \Sweden\, Year == 2020) %>% str()
fable [1 × 5] (S3: fbl_ts/tbl_ts/tbl_df/tbl/data.frame)
$ Country: Factor w/ 263 levels \Afghanistan\,..: 232
$ .model : chr \trend_model\
$ Year : num 2020
$ GDP : dist [1:1]
..$ 3:List of 2
.. ..$ mu : num 5.45e+11
.. ..$ sigma: num 5.34e+10
.. ..- attr(*, \class\)= chr [1:2] \dist_normal\ \dist_default\
..@ vars: chr \GDP\
$ .mean : num 5.45e+11
- attr(*, \key\)= tibble [1 × 3] (S3: tbl_df/tbl/data.frame)
..$ Country: Factor w/ 263 levels \Afghanistan\,..: 232
..$ .model : chr \trend_model\
..$ .rows : list<int> [1:1]
.. ..$ : int 1
.. ..@ ptype: int(0)
..- attr(*, \.drop\)= logi TRUE
- attr(*, \index\)= chr \Year\
..- attr(*, \ordered\)= logi TRUE
- attr(*, \index2\)= chr \Year\
- attr(*, \interval\)= interval [1:1] 1Y
..@ .regular: logi TRUE
- attr(*, \response\)= chr \GDP\
- attr(*, \dist\)= chr \GDP\
- attr(*, \model_cn\)= chr \.model\
fcast3yrs %>%
filter(Country==\Sweden\) %>%
autoplot(global_economy) +
ggtitle(\GDP for Sweden\) + ylab(\$US billions\)

Model residuals vs. forecast errors
Model residuals:
Your data: \(y_1, y_2, \ldots, y_T\)
Fitted values: \(\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T\)
Model residuals: \(e_t = y_t - \hat{y}_t\)
Forecast errors:
augment(fit)
augment(fit) %>% filter(Country == \Sweden\) %>%
ggplot(aes(x = .resid)) +
geom_histogram(bins = 20) +
ggtitle(\Histogram of residuals\)

Example: GDP, several countries
library(tsibbledata) # Data sets package
nordic <- c(\Sweden\, \Denmark\, \Norway\, \Finland\)
(global_economy %>% filter(Country %in% nordic) -> nordic_economy)
nordic_economy %>% autoplot(GDP)

fitnord <- nordic_economy %>%
model(
trend_model = TSLM(GDP ~ trend()),
trend_model_ln = TSLM(log(GDP) ~ trend()),
ets = ETS(GDP ~ trend(\A\)),
arima = ARIMA(GDP)
)
fitnord
fitnord %>%
select(arima) %>%
coef()
Denmark: ARMA(1,1)
Finland: MA(2)
Norway: MA(1)
Sweden: MA(2)
nordic_economy %>%
model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
fitnord %>% coef()
fitnord %>% glance()
fitnord %>% filter(Country == \Denmark\) %>% select(arima) %>% report()
Series: GDP
Model: ARIMA(1,1,1)
Coefficients:
ar1 ma1
-0.3898 0.7240
s.e. 0.2061 0.1434
sigma^2 estimated as 2.407e+20: log likelihood=-1417.5
AIC=2840.99 AICc=2841.45 BIC=2847.12
fitnord %>%
accuracy() %>%
arrange(Country, MPE)
# ETS forecasts
USAccDeaths %>%
ets() %>%
forecast() %>%
autoplot()
str(taylor)
plot(taylor)
References
---
title:     "Exploratory analysis of time series data: Examples"
institute: "SYS 5581 Time Series & Forecasting, Spring 2021" 
author:     "Instructor: Arthur Small"
date:       "Version of `r Sys.Date()`"
output: 
  html_notebook:
    number_sections: true
    toc: yes
    toc_depth: 4
    code_folding: show # options: show, hide
    fig_caption: yes
  # html_document:
  #       keep_md: yes
  # pdf_document: default
bibliography: /Users/Arthur/GitRepos/Teaching/time-series/tseries.bib
link-citations: yes
---

```{r set up coding environment, include=FALSE, message=FALSE}
# library(dplyr) -- don't need this if you are loading the entire 'tidyverse' suite
library(tidyverse)
library(lubridate) # For easy handling of time-indexed objects

# if(!('fpp3' %in% installed.packages())) install.packages('fpp3')
library(fpp3)
```


# Example: Monthly electricity sales for Virginia

## Extract data from remote database

```{r open connection to database, eval=FALSE, include=FALSE}
# Open connection to a remote database
# Make sure your VPN network connection is active if needed!

# if(!('RPostgreSQL' %in% installed.packages())) install.packages('RPostgreSQL')
library(RPostgreSQL)

# "my_postgres_credentials.R" contains the log-in information
source("/Users/Arthur/GitRepos/Teaching/my_postgres_db_credentials.R")

# Open connection
db_driver <- dbDriver("PostgreSQL")
db <- dbConnect(db_driver,user=user, password=password,dbname="postgres", host=host)
rm(password) 

# check the connection: If function returns value TRUE, the connection is working
dbExistsTable(db, "metadata")
```

```{r retrieve data from db, eval=FALSE, message=FALSE}
esales <- dbGetQuery(db,'SELECT * from eia_elec_sales_va_all_m') # SQL code to retrieve data from a table in the remote database
# str(esales)
esales <- as_tibble(esales) # Convert dataframe to a 'tibble' for tidyverse work
# str(esales)
```

```{r save data in Apache Arrow format, eval=FALSE}
# Reference: https://arrow.apache.org/docs/r/
# if(!('arrow' %in% installed.packages())) install.packages('arrow')
library(arrow)
write_feather(esales, "esales.feather")
```

```{r close db connection, eval=FALSE}
# Close connection -- this is good practice
dbDisconnect(db)
dbUnloadDriver(db_driver)
```

## Briefly characterize the dataset

```{r read in data}
library(arrow)
esales <- read_feather("esales.feather")
```

```{r }
print(esales)    # print the data as a table
summary(esales)  # compute basic summary statistics about the data
```

```{r use tidyverse syntax to perform some simple data manipulations}
# References: https://www.tidyverse.org/, https://dplyr.tidyverse.org/

esales %>%
  filter(year == 2019) %>%
  filter(value > 9000) %>%
  print()

esales %>%
  group_by(month) %>%
  summarise(mean = mean(value)) -> mean_esales_by_month

esales %>%
  mutate(sales_TWh = value/1000) %>%
  select(-value)
  
# filter(data object, condition) : syntax for filter() command
```

## Plot the time series

```{r use ggplot2 to generate a plot}
#Reference: https://ggplot2.tidyverse.org/

ggplot(data=esales, aes(x=date,y=value)) + 
  geom_line() + xlab("Year") + ylab("Virginia monthly total electricity sales (GWh)")
```
## Convert the data frame into a time series `tsibble` object

```{r}
library(lubridate) # Make it easy to deal with dates

esales %>% filter(month==3)                   # These three lines of code
esales %>% filter(month(date)==3)             #   all do
esales %>% filter(lubridate::month(date)==3)  #   the same thing.

# We don't have to keep the 'year' and 'month' column: can recover them if needed

esales %>%
  select(date, sales_GWh = value) -> esales_tbl

print(elsales_tbl)
```

```{r}
# install.packages("tsibble")
library(tsibble) # Reference: https://tsibble.tidyverts.org/articles/intro-tsibble.html

esales_tbl %>% as_tsibble(index = date) -> elsales_tbl_ts

print(elsales_tbl_ts)
```

## Make some plots

###  Make a histogram of monthly sales

```{r make a histogram of the data}
hist(elsales_tbl_ts$sales_GWh, breaks=40)
```

###  Make a seasonal plot

```{r, eval=FALSE}
elsales_tbl_ts %>% 
  feasts::gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```

```{r}
# install.packages("feasts"), Reference: https://feasts.tidyverts.org/
library(feasts)

elsales_tbl_ts %>% 
  mutate(Month = tsibble::yearmonth(date)) %>% 
  as_tsibble(index = Month) %>%
  select(Month,sales_GWh) -> vaelsales_tbl_ts

print(vaelsales_tbl_ts)
```

```{r , warning=FALSE}
vaelsales_tbl_ts %>% gg_season(sales_GWh, labels = "both") + ylab("Virginia electricity sales (GWh)")
```


```{r}
vaelsales_tbl_ts %>% 
  gg_subseries(sales_GWh)
```

```{r plot lagged values}
vaelsales_tbl_ts  %>% filter(month(Month) %in% c(3,6,9,12)) %>% gg_lag(sales_GWh, lags = 1:2)

vaelsales_tbl_ts  %>% filter(month(Month) == 1) %>% gg_lag(sales_GWh, lags = 1:2)
```

```{r}
vaelsales_tbl_ts %>% ACF(sales_GWh) %>% autoplot()
```

```{r perform automated time series decomposition}


# decompose(vaelsales_tbl_ts)
```

```{r perform additive STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  model(STL(sales_GWh ~ trend(window=21) + season(window='periodic'), robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r perform multiplicative STL decomposition of the VA electricity sales time series}
vaelsales_tbl_ts %>%
  mutate(ln_sales_GWh = log(sales_GWh)) %>%
  model(STL(ln_sales_GWh ~ trend(window=21) + season(window='periodic'),
    robust = TRUE)) %>%
  components() %>%
  autoplot()
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feat_stl)
```

```{r}
vaelsales_tbl_ts %>%
  features(sales_GWh, feature_set(pkgs="feasts"))
```

# Example: Australian production

```{r, warning=FALSE}
# install.packages('tsibbledata')
library(tsibbledata)

aus_production

aus_production %>% gg_season(Electricity)

aus_production %>% gg_season(Beer)
```

# Example: Gross Domestic Product data

## Exploratory data analysis

```{r}
library(tsibbledata) # Data sets package

print(global_economy)
```

```{r}
global_economy %>% filter(Country=="Sweden") %>% print()
```

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(GDP) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Fitting data to simple models

```{r}
global_economy %>% model(trend_model = TSLM(GDP ~ trend())) -> fit

fit


```

```{r}
fit %>% filter(Country == "Sweden") %>% residuals()
```

```{r}

fit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot(.resid)
```

### Work with ln(GDP)

```{r}
global_economy %>%
  filter(Country=="Sweden") %>%
  autoplot(log(GDP)) +
  ggtitle("ln(GDP) for Sweden") + ylab("$US billions")
```

```{r}
global_economy %>%
  model(trend_model = TSLM(log(GDP) ~ trend())) -> logfit
```

```{r}
logfit %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()
```

```{r}
global_economy %>% model(trend_model = TSLM(log(GDP) ~ log(Population))) -> fit3

fit3 %>% filter(Country == "Sweden") %>% residuals() %>% autoplot()

```

# Producing forecasts

```{r}
fit %>% forecast(h = "3 years") -> fcast3yrs

fcast3yrs

```

```{r}

fcast3yrs %>% filter(Country == "Sweden", Year == 2020) %>% str()
```

```{r visualize forecasts}
fcast3yrs %>% 
  filter(Country=="Sweden") %>%
  autoplot(global_economy) +
  ggtitle("GDP for Sweden") + ylab("$US billions")
```

## Model residuals vs. forecast errors

Model residuals:

Your data: $y_1, y_2, \ldots, y_T$

Fitted values: $\hat{y}_1, \hat{y}_2, \ldots, \hat{y}_T$

Model residuals: $e_t = y_t - \hat{y}_t$

Forecast errors:

```{r}
augment(fit)
```

```{r}
augment(fit) %>% filter(Country == "Sweden") %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 20) +
  ggtitle("Histogram of residuals")
```

## Are the model residuals auto-correlated?

```{r}
augment(fit) %>% filter(Country == "Sweden") -> augSweden

augSweden %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```

```{r}
augment(fit3) %>% filter(Country == "Sweden") -> augSweden3

augSweden3 %>%
  ACF(.resid) %>%
  autoplot() + ggtitle("ACF of residuals")
```


# Example: GDP, several countries


```{r}
library(tsibbledata) # Data sets package

nordic <- c("Sweden", "Denmark", "Norway", "Finland")

(global_economy %>% filter(Country %in% nordic) -> nordic_economy)

```

```{r}
nordic_economy %>% autoplot(GDP)
```

```{r}
fitnord <- nordic_economy %>%
  model(
    trend_model = TSLM(GDP ~ trend()),
    trend_model_ln = TSLM(log(GDP) ~ trend()),
    ets = ETS(GDP ~ trend("A")),
    arima = ARIMA(GDP)
  )

fitnord
```

```{r}
fitnord %>%
  select(arima) %>%
  coef()
```


Denmark: ARMA(1,1)

Finland: MA(2)

Norway: MA(1)

Sweden: MA(2)

```{r}
nordic_economy %>%
  model(arima_constrained = ARIMA(GDP ~ pdq(1,0,2))) %>% select(arima_constrained) %>% coef()
```

```{r}
fitnord %>% coef() 
```

```{r}
fitnord %>%  glance()  
```

```{r}
fitnord %>% filter(Country == "Denmark") %>% select(arima) %>% report()
```

```{r}
fitnord %>%
  accuracy() %>%
  arrange(Country, MPE)
```



```{r, eval=FALSE}
# ETS forecasts
USAccDeaths %>%
  ets() %>%
  forecast() %>%
  autoplot()
```

```{r, eval=FALSE}
str(taylor)
plot(taylor)
```



# References
